In [103]:
import os

# Number of k-means clusters to build (sized for ~25k keywords and phrases).
NUM_CLUSTERS         = 3000
# Number of cluster synonyms each keyword is mapped to.
NUM_CLUSTER_SYNONYMS = 5

# Single place to change where the input and output files live.
DATA_DIR             = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk"

# Input: top 5k dice keywords.
KEY_WORDS_FILE       = os.path.join(DATA_DIR, "top_5k_keywords.txt")
# Outputs: query-time and index-time synonym files.
SYNONYMS_QRY_FILE    = os.path.join(DATA_DIR, "cluster_keyword_synonym_qry.txt")
SYNONYMS_INDEX_FILE  = os.path.join(DATA_DIR, "cluster_keyword_synonym_ix.txt")
# Inputs: phrases list and the saved word2vec model.
PHRASES_FILE         = os.path.join(DATA_DIR, "Phrases.txt")
MODEL_FILE           = os.path.join(DATA_DIR, "keyword_model.w2v")
# File name embeds the cluster count.
CLUSTERS_FILE        = os.path.join(DATA_DIR, "%i_clusters.txt" % NUM_CLUSTERS)

In [1]:
import numpy as np
#Shared
#just used to load phrases file
def load_stop_words(stop_words_file):
    """Read a one-entry-per-line word list into a set of lower-cased strings.

    Leading/trailing whitespace is stripped from every line. Blank lines and
    lines whose first non-blank character is "#" are skipped.

    Args:
        stop_words_file: path of the text file to read.

    Returns:
        A set containing the lower-cased entries.
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # Guard against blank lines: indexing word[0] on an empty string
            # would raise IndexError.
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words

def get_vector(item, model):
    """Return the stored embedding row for `item`.

    Looks `item` up in `model.vocab`, then uses that entry's `index` to select
    the row of `model.syn0`. Raises KeyError if `item` is not in the vocabulary.
    """
    row = model.vocab[item].index
    return model.syn0[row]

def get_norm_vector(item, model):
    """Return the unit-length embedding for `item`, or None if it is unknown.

    The norm is computed here on every call because deserialized models do not
    carry their pre-normalised vectors. A zero-length vector is returned as-is
    to avoid dividing by zero.
    """
    if item in model.vocab:
        raw = get_vector(item, model)
        length = np.linalg.norm(raw)
        return raw if length == 0 else raw / length
    return None

In [ ]:
import time
# Wall-clock start of the whole run; the final timing cell subtracts this
# from grand_end to report the total elapsed seconds.
grand_start = time.time()

In [88]:
import numpy as np
from collections import defaultdict

#functions
def is_valid_search_keyword(kw):
    """Decide whether a raw search keyword should be kept.

    A keyword is rejected when it
      * contains a parenthesis, or one of the words and / or / not / true /
        TRUE / false / FALSE as a space-delimited token,
      * is a single one-character token, or
      * has any token starting with "-" (a negated term).
    """
    # Pad with spaces so words at either end still match " word ".
    padded = " " + kw + " "
    banned = "(,), and , or , not , true , TRUE , false , FALSE ".split(",")
    if any(marker in padded for marker in banned):
        return False

    pieces = kw.split(" ")

    # Single character keywords carry no useful signal.
    if len(pieces) == 1 and len(kw) == 1:
        return False

    # Queries containing negated terms are dropped.
    return not any(piece.strip().startswith("-") for piece in pieces)

def map_keyword(kw):
    """Return `kw` with every space character replaced by an underscore."""
    return "_".join(kw.split(" "))

def extract_clusters(ids, id2kwd):
    """Group keywords by their assigned cluster label.

    `ids[i]` is the label given to the keyword whose id is `i`; `id2kwd` maps
    that id back to the keyword text. Returns a defaultdict(set) keyed by
    label, holding the keywords that share it.
    """
    members_by_label = defaultdict(set)
    for position, cluster_id in enumerate(ids):
        members_by_label[cluster_id].add(id2kwd[position])
    return members_by_label

def extract_centroids(km_clusterer):
    """Map each cluster index to its centre scaled to unit length.

    Reads `km_clusterer.cluster_centers_` row by row. A centre whose norm is
    zero is stored unchanged, since it cannot be normalised.
    """
    unit_centres = {}
    for label, centre in enumerate(km_clusterer.cluster_centers_):
        length = np.linalg.norm(centre)
        unit_centres[label] = centre / length if length > 0.0 else centre
    return unit_centres

def compute_cluster_similarities(kwds, kwd2id, vectors, lbl2centroid):
    """Rank every cluster centroid by similarity to each keyword.

    The similarity is the inner product of the keyword vector and the
    centroid, which equals the cosine similarity provided both are unit length.

    Args:
        kwds: iterable of keywords to score.
        kwd2id: mapping from keyword to its position in `vectors`.
        vectors: sequence of keyword vectors, indexed by keyword id.
        lbl2centroid: mapping from cluster label to centroid vector.

    Returns:
        dict mapping each keyword to a list of (label, similarity) tuples,
        sorted from most to least similar.
    """
    # Materialise once so the total is known even for one-shot iterables.
    # The progress message previously read the notebook-global `all_kwds`.
    kwds = list(kwds)
    total = len(kwds)

    kwd2cluster_sims = dict()
    for kwd in kwds:
        nvec = vectors[kwd2id[kwd]]
        sims = [(lbl, np.inner(nvec, centroid))
                for lbl, centroid in lbl2centroid.items()]
        # Index into the pair rather than unpacking in the lambda signature:
        # tuple parameters are a syntax error on Python 3.
        sims.sort(key=lambda pair: -pair[1])
        kwd2cluster_sims[kwd] = sims
        if len(kwd2cluster_sims) % 1000 == 0:
            print("%i computed out of %i" % (len(kwd2cluster_sims), total))
    return kwd2cluster_sims

# expand at query time
# use with tfidf (on cluster labels) at index time by just mapping to cluster label
def write_most_similar_clusters(topn, kwd2cluster_sims, synonym_qry_fname, synonyn_index_fname):
    """Write the query-time and index-time cluster synonym files.

    Query file: one line per keyword of the form
        keyword=>cluster_A|sim cluster_B|sim ...
    listing its `topn` most similar clusters. The best cluster is kept,
    because the target is a cluster label rather than another keyword.

    Index file: one line per keyword of the form
        keyword=>cluster_A
    naming only its single most similar cluster.

    Keywords are written in sorted order. Keywords with no similarity entries
    are omitted from both files.

    Args:
        topn: number of clusters to list per keyword in the query file.
        kwd2cluster_sims: mapping from keyword to a list of (label, similarity)
            tuples, ordered best first.
        synonym_qry_fname: output path of the query-time file.
        synonyn_index_fname: output path of the index-time file.
    """
    def cluster_label(lbl):
        return "cluster_" + str(lbl)

    kwords = sorted(kwd2cluster_sims.keys())

    with open(synonym_qry_fname, "w") as qry_f:
        for kword in kwords:
            top_clusters = kwd2cluster_sims[kword][:topn]
            if not top_clusters:
                continue
            entries = "".join("%s|%f " % (cluster_label(lbl), sim)
                              for lbl, sim in top_clusters)
            qry_f.write("%s=>%s\n" % (kword, entries))

    with open(synonyn_index_fname, "w") as ix_f:
        for kword in kwords:
            cl_sims = kwd2cluster_sims[kword]
            # Same guard as the query file: indexing [0] on an empty list
            # would raise IndexError.
            if not cl_sims:
                continue
            lbl, _sim = cl_sims[0]
            ix_f.write("%s=>%s\n" % (kword, cluster_label(lbl)))

In [8]:
import gensim, time
from gensim.models.word2vec import Word2Vec

# Load the previously saved word2vec model from MODEL_FILE.
model = Word2Vec.load(MODEL_FILE)

In [9]:
# load_stop_words is reused here purely as a line-per-entry file reader:
# it returns the lower-cased phrases as a set.
phrases = load_stop_words(PHRASES_FILE)
# Number of distinct phrases loaded.
len(phrases)


Out[9]:
24785

In [10]:
# Read the keyword file, keeping non-blank lines that pass the search-keyword
# filter, in file order.
keywords = []
un_keywords = set()
with open(KEY_WORDS_FILE) as f:
    for line in f:
        kw = line.strip()
        if not kw:
            continue
        if is_valid_search_keyword(kw):
            keywords.append(kw)
print("%i keywords loaded from %s" % (len(keywords), KEY_WORDS_FILE))


4709 keywords loaded from /Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt

In [54]:
# Build the working vocabulary: every phrase plus every search keyword,
# then drop anything the word2vec model has no vector for.
all_kwds = phrases.union(keywords)
# Alternative (search keywords only, no phrases): all_kwds = set(keywords)
# Iterate over a snapshot because the set is mutated inside the loop.
for kwd in list(all_kwds):
    if kwd not in model.vocab:
        all_kwds.remove(kwd)
    splt = kwd.split(" ")
    # For multi-word entries, add each individual word that the model knows.
    # This runs even when the full entry itself was just removed above.
    if splt and len(splt) > 1:
        for wd in splt:
            if wd.strip() and wd in model.vocab:
                all_kwds.add(wd)

# Give every term a dense integer id equal to its row position in `vectors`,
# and keep lookups in both directions. Ids follow the set's iteration order.
id2kwd = dict()
kwd2id = dict()
vectors = []
for term in all_kwds:
    id2kwd[len(vectors)] = term
    kwd2id[term] = len(vectors)
    # Unit-length vector; every term is in the vocabulary at this point.
    vec = get_norm_vector(term, model)
    vectors.append(vec)

# Sanity check: both counts should be equal.
len(all_kwds), len(vectors)


Out[54]:
(25189, 25189)

In [21]:
from sklearn import cluster
from sklearn.cluster import KMeans
import time

# Fixed seed so the centroid initialisation, and therefore the cluster labels
# written to the synonym files, are the same on every run.
RANDOM_SEED = 42

start = time.time()

# Runs single-process: parallelising with n_jobs = -1 did not seem to work.
# n_jobs is no longer passed at all; it was set to its old default of 1 and
# current scikit-learn releases reject the argument.
print("Clustering vectors into %i clusters" % NUM_CLUSTERS)
km_clusterer = KMeans(n_clusters=NUM_CLUSTERS, verbose=1, n_init=5, random_state=RANDOM_SEED)
# ids[i] is the cluster label assigned to vectors[i].
ids = km_clusterer.fit_predict(vectors)

end = time.time()
print("Creating %i clusters took %i seconds" % (NUM_CLUSTERS, end - start))


Clustering vectors into 3000 clusters
Initialization complete
Iteration  0, inertia 19544.329
Iteration  1, inertia 12955.389
Iteration  2, inertia 12826.108
Iteration  3, inertia 12788.313
Iteration  4, inertia 12771.860
Iteration  5, inertia 12764.087
Iteration  6, inertia 12759.219
Iteration  7, inertia 12756.924
Iteration  8, inertia 12755.063
Iteration  9, inertia 12754.322
Iteration 10, inertia 12753.633
Iteration 11, inertia 12753.162
Iteration 12, inertia 12753.076
Iteration 13, inertia 12753.031
Iteration 14, inertia 12752.941
Iteration 15, inertia 12752.919
Iteration 16, inertia 12752.891
Converged at iteration 16
Initialization complete
Iteration  0, inertia 19562.936
Iteration  1, inertia 12963.549
Iteration  2, inertia 12835.096
Iteration  3, inertia 12793.454
Iteration  4, inertia 12775.182
Iteration  5, inertia 12764.331
Iteration  6, inertia 12757.906
Iteration  7, inertia 12754.767
Iteration  8, inertia 12753.236
Iteration  9, inertia 12752.421
Iteration 10, inertia 12752.234
Iteration 11, inertia 12752.218
Converged at iteration 11
Initialization complete
Iteration  0, inertia 19551.047
Iteration  1, inertia 12962.796
Iteration  2, inertia 12837.073
Iteration  3, inertia 12795.039
Iteration  4, inertia 12775.550
Iteration  5, inertia 12766.712
Iteration  6, inertia 12762.237
Iteration  7, inertia 12760.335
Iteration  8, inertia 12759.498
Iteration  9, inertia 12759.030
Iteration 10, inertia 12758.851
Converged at iteration 10
Initialization complete
Iteration  0, inertia 19545.103
Iteration  1, inertia 12943.239
Iteration  2, inertia 12818.721
Iteration  3, inertia 12778.609
Iteration  4, inertia 12761.095
Iteration  5, inertia 12753.191
Iteration  6, inertia 12749.494
Iteration  7, inertia 12747.253
Iteration  8, inertia 12745.256
Iteration  9, inertia 12743.906
Iteration 10, inertia 12742.885
Iteration 11, inertia 12742.287
Iteration 12, inertia 12741.858
Iteration 13, inertia 12741.410
Iteration 14, inertia 12741.126
Iteration 15, inertia 12741.004
Iteration 16, inertia 12740.937
Converged at iteration 16
Initialization complete
Iteration  0, inertia 19562.984
Iteration  1, inertia 12958.604
Iteration  2, inertia 12828.955
Iteration  3, inertia 12785.170
Iteration  4, inertia 12766.394
Iteration  5, inertia 12759.080
Iteration  6, inertia 12755.253
Iteration  7, inertia 12752.681
Iteration  8, inertia 12751.589
Iteration  9, inertia 12751.017
Iteration 10, inertia 12750.028
Iteration 11, inertia 12749.291
Iteration 12, inertia 12748.855
Iteration 13, inertia 12748.299
Iteration 14, inertia 12747.935
Iteration 15, inertia 12747.405
Iteration 16, inertia 12746.919
Iteration 17, inertia 12746.475
Iteration 18, inertia 12746.113
Iteration 19, inertia 12745.953
Converged at iteration 19

In [53]:
# Cluster label -> set of member keywords.
lbl2cluster = extract_clusters(ids, id2kwd)
# Cluster label -> unit-length centroid vector.
lbl2centroid = extract_centroids(km_clusterer)

# Number of labels in each mapping.
len(lbl2cluster), len(lbl2centroid)


Out[53]:
(3000, 3000)

In [59]:
import time
# Time the ranking of all centroids against every keyword.
start = time.time()

# keyword -> list of (cluster label, similarity), most similar first.
kwd2cluster_sims = compute_cluster_similarities(all_kwds, kwd2id, vectors, lbl2centroid)
end = time.time()
print("Sorting the clusters for each of the %i keywords took %i seconds" % (len(all_kwds),end - start))


1000 computed out of 25189
2000 computed out of 25189
3000 computed out of 25189
4000 computed out of 25189
5000 computed out of 25189
6000 computed out of 25189
7000 computed out of 25189
8000 computed out of 25189
9000 computed out of 25189
10000 computed out of 25189
11000 computed out of 25189
12000 computed out of 25189
13000 computed out of 25189
14000 computed out of 25189
15000 computed out of 25189
16000 computed out of 25189
17000 computed out of 25189
18000 computed out of 25189
19000 computed out of 25189
20000 computed out of 25189
21000 computed out of 25189
22000 computed out of 25189
23000 computed out of 25189
24000 computed out of 25189
25000 computed out of 25189
Sorting the clusters for each of the 25189 keywords took 485 seconds

In [101]:
# Write the query-time file (top NUM_CLUSTER_SYNONYMS clusters per keyword)
# and the index-time file (single best cluster per keyword).
write_most_similar_clusters(NUM_CLUSTER_SYNONYMS, kwd2cluster_sims, SYNONYMS_QRY_FILE, SYNONYMS_INDEX_FILE)

In [ ]:
# Total wall-clock time since grand_start was recorded.
grand_end = time.time()
print("Cluster generation and processing took %i seconds" % (grand_end - grand_start))

Examine the Clusters


In [102]:
# Show the first 100 clusters. list() is required because dict.values()
# returns a non-subscriptable view on Python 3.
list(lbl2cluster.values())[0:100]


Out[102]:
[{'switches'},
 {'coding testing',
  'complete software',
  'entire software',
  'lifecycle requirements',
  'requirements gathering design'},
 {'build solutions',
  'code base',
  'code bases',
  'codebase',
  'define best',
  'deliver products',
  'existing code',
  'existing features',
  'existing ones',
  'existing product',
  'existing production',
  'existing services',
  'feature',
  'feature set',
  'fine tune',
  'flawless',
  'implement enhancements',
  'implement product',
  'legacy code',
  'migration activities',
  'necessary changes',
  'product architecture',
  'quality applications',
  'refactor',
  'refactoring',
  'reuse',
  'rewrite',
  'usefulness'},
 {'application hosting',
  'application web',
  'database web',
  'databases web',
  'enterprise technologies',
  'server setup',
  'technologies used'},
 {'bi business',
  'bi stack',
  'mdx',
  'olap',
  'sql bi',
  'ssas',
  'ssis',
  'ssis developer',
  'ssis ssas',
  'ssrs ssas'},
 {'bi lingual',
  'bilingual',
  'chinese',
  'fluent',
  'french',
  'german',
  'japanese',
  'korean',
  'lingual',
  'localized',
  'portuguese',
  'russian',
  'spanish',
  'speak',
  'speaker'},
 {'connectivity',
  'firewalls routers switches',
  'firewalls switches',
  'infrastructure environment',
  'load balancers firewalls',
  'mobile computing',
  'network components',
  'network devices',
  'network hardware',
  'network operating systems',
  'network servers',
  'networking equipment',
  'networks servers',
  'operating systems applications',
  'operating systems network',
  'platforms operating systems',
  'routers switches firewalls',
  'security hardware',
  'servers network',
  'servers networking',
  'servers storage',
  'switches routers',
  'switches routers firewalls',
  'systems servers'},
 {'detail design', 'requirements review'},
 {'company specializing',
  'consulting firm',
  'consulting organization',
  'encouraged',
  'genesis10',
  'matlen silver',
  'moment',
  'prefered',
  'providing services',
  'pwc',
  'sponsored',
  'sponsoring',
  'sponsorships'},
 {'narrative'},
 {'project progress'},
 {'attention',
  'attentive',
  'close attention',
  'compromising',
  'conscientious',
  'conscious',
  'customer oriented',
  'customer service',
  'customer service communication',
  'customer service focus',
  'customer service oriented',
  'deliver results',
  'delivering results',
  'demonstrated commitment',
  'dependability',
  'dependable',
  'detailed oriented',
  'diligence',
  'diligent',
  'do attitude',
  'ethic',
  'excellent follow',
  'extremely detail oriented',
  'good attention',
  'great customer service',
  'habits',
  'impeccable',
  'meticulous',
  'meticulous attention',
  'organized',
  'orientated',
  'outgoing',
  'outstanding customer service',
  'pay attention',
  'personality',
  'pleasant',
  'positive attitude',
  'producing quality',
  'professional appearance',
  'professional attitude',
  'professional demeanor',
  'punctual',
  'punctuality',
  'quality customer service',
  'quality results',
  'quantity',
  'sacrificing',
  'self motivated detail oriented',
  'self motivation',
  'superb',
  'superior',
  'thoroughness',
  'traits'},
 {'process definition',
  'process owner',
  'process re engineering',
  'quality tools',
  're engineer'},
 {'arrangements',
  'assistance program',
  'cafeteria',
  'child',
  'club',
  'commuter',
  'credit union',
  'discount',
  'discounted',
  'discounts',
  'eap',
  'employee assistance',
  'employee assistance program',
  'employee stock purchase',
  'fitness center',
  'flex',
  'gym membership',
  'parking',
  'passes',
  'reimbursement',
  'subsidized',
  'supplement',
  'tuition assistance',
  'vision plans',
  'wellness'},
 {'cocoa', 'swift'},
 {'create prototypes', 'explore', 'illustrate'},
 {'document system',
  'drive solutions',
  'implement processes',
  'system modifications'},
 {'enterprise performance', 'hhs'},
 {'accustomed',
  'adapt',
  'adapt quickly',
  'adaptability',
  'adaptable',
  'adapting',
  'aggressive',
  'ambiguous',
  'broad base',
  'busy',
  'challenging environment',
  'changing business',
  'changing environment',
  'changing priorities',
  'changing requirements',
  'comfortable working',
  'comfortably',
  'constantly changing',
  'demands',
  'dynamic',
  'dynamic business',
  'dynamic environment',
  'dynamic fast',
  'dynamic fast paced',
  'embrace change',
  'entrepreneurial environment',
  'environment managing',
  'ever changing',
  'extremely fast',
  'fast changing',
  'fast pace',
  'fast pace environment',
  'fast paced',
  'fast paced agile',
  'fast paced agile environment',
  'fast paced business',
  'fast paced changing',
  'fast paced deadline',
  'fast paced deadline driven',
  'fast paced demanding',
  'fast paced dynamic',
  'fast paced dynamic environment',
  'fast paced environment',
  'fast paced ever',
  'fast paced highly',
  'fast paced results',
  'flexibility',
  'flexible',
  'focused environment',
  'function effectively',
  'gears',
  'highly dynamic',
  'interrupt',
  'operate effectively',
  'operate independently',
  'paced',
  'produce results',
  'productively',
  'project environment',
  'quick turnaround',
  'quickly adapt',
  'respond quickly',
  'self motivated team player',
  'shifting',
  'shifting priorities',
  'somewhat',
  'technical environment',
  'thrive',
  'tight',
  'tight timelines',
  'turnaround',
  'whilst'},
 {'advanced system',
  'computer operating systems',
  'desktop environment',
  'desktop operating',
  'enterprise management',
  'hardware',
  'hardware software',
  'hardware systems',
  'installation configuration maintenance',
  'installation configuration operation',
  'lifecycle management',
  'network operating',
  'operating system software',
  'operating systems hardware',
  'operating systems software',
  'oss',
  'server desktop',
  'server system',
  'software operating',
  'software operating systems',
  'system environment',
  'system hardware',
  'system infrastructure',
  'system platforms',
  'system software',
  'systems operations',
  'troubleshooting software',
  'troubleshooting support',
  'windows server operating',
  'windows support'},
 {'capacity',
  'computer equipment',
  'devises',
  'general direction',
  'limitations',
  'limitations operating',
  'modifies procedures',
  'solve complex',
  'solve complex business',
  'solve complex problems',
  'system capacity',
  'system s'},
 {'4.x', '5.x', 'esxi', 'vmware esxi'},
 {'age',
  'age color',
  'age disability veteran status',
  'basis',
  'category',
  'color national origin',
  'disability age',
  'disability protected veteran status',
  'disability veteran status',
  'discriminate',
  'ethnicity',
  'expression',
  'genetic',
  'genetic information',
  'genetics',
  'identity disability',
  'information marital status',
  'legally',
  'legally protected',
  'marital',
  'marital status',
  'military',
  'military service',
  'national origin',
  'national origin ancestry',
  'national origin disability',
  'perceived',
  'prokarma',
  'protected',
  'race',
  'status age',
  'unrelated',
  'veteran status',
  'veteran status disability'},
 {'.net asp',
  'asp',
  'c# java',
  'c# vb',
  'cold fusion',
  'coldfusion',
  'software analysis',
  'vb',
  'visual basic'},
 {'application development',
  'client server application',
  'coding debugging',
  'complete sdlc',
  'complex software systems',
  'cross platform',
  'different software',
  'end software',
  'enterprise class software',
  'managing software',
  'overall software',
  'performing application',
  'production software',
  'professional software',
  'qa software testing',
  'requirements design implementation',
  'software development',
  'software integration',
  'software life cycle',
  'software quality assurance',
  'software testing',
  'systems environments',
  'systems level',
  'test qa',
  'testing programs',
  'years+'},
 {'analysis testing',
  'automated solutions',
  'coordinate system',
  'coordinate test',
  'design test plans',
  'end performance',
  'execute software',
  'execution',
  'newly developed',
  'perform test',
  'requirements testing',
  'scenarios',
  'support test',
  'support testing',
  'support user',
  'system integration',
  'system test',
  'testing activities'},
 {'dhtml', 'xml html'},
 {'batch processes',
  'build data',
  'cdc',
  'change data capture',
  'dac',
  'event processing',
  'optim',
  'oracle warehouse builder',
  'partition',
  'power center',
  'quantities'},
 {'elite', 'norcross', 'pega architect'},
 {'html5 css3 jquery'},
 {'alike',
  'alliances',
  'appropriate communication',
  'better understanding',
  'build rapport',
  'build relationships',
  'building relationships',
  'builds relationships',
  'business contacts',
  'business relationships',
  'client organization',
  'client relationships',
  'clients vendors',
  'collaborative relationships',
  'collaborative working',
  'communication collaboration',
  'company management',
  'constituent',
  'constituents',
  'cooperation',
  'cooperative',
  'cooperative working',
  'credibility',
  'cultivate',
  'cultivates',
  'cultivating',
  'customer relationships',
  'cxo',
  'develop relationships',
  'director vp',
  'effective relationships',
  'effective working',
  'effective working relationships',
  'effectively build',
  'establish credibility',
  'establish relationships',
  'excellent people',
  'good working relationships',
  'influencers',
  'information exchange',
  'internal team members',
  'interpersonal relationship',
  'maintain effective',
  'maintain effective working relationships',
  'maintain excellent',
  'maintain good',
  'maintain positive',
  'maintaining effective',
  'maintains effective',
  'maintains relationships',
  'outstanding customer',
  'positive customer',
  'positive relationships',
  'positive working',
  'productive',
  'productive relationships',
  'productive working',
  'professional business',
  'professional communication',
  'professional relationships',
  'professional working',
  'rapport',
  'relations',
  'relationships',
  'relationships throughout',
  'respectful',
  'trusting',
  'win win',
  'working relationships'},
 {'good documentation', 'microsoft excel word', 'solid working'},
 {'cordova', 'end technologies', 'js jquery'},
 {'account executives',
  'account managers',
  'account planning',
  'account teams',
  'alliance',
  'channel',
  'channel partners',
  'client partner',
  'develop customer',
  'existing clients',
  'field sales',
  'integrators',
  'internal sales',
  'market share',
  'partner relationships',
  'pre sales support',
  'pre sales technical',
  'presales',
  'product sales',
  'rep',
  'reps',
  'reseller',
  'resellers',
  'sales',
  'sales account',
  'sales director',
  'sales management',
  'sales managers',
  'sales process',
  'sales product',
  'sales representative',
  'sales representatives',
  'sales reps',
  'sales sales',
  'sales strategy',
  'sales team',
  'sales teams',
  'sales technical',
  'se',
  'services sales',
  'solutions architects',
  'strategic partners',
  'technical account',
  'technology sales',
  'territory'},
 {'attend',
  'attend meetings',
  'attend project',
  'attends',
  'chair',
  'conference calls',
  'discussion',
  'kick',
  'kickoff',
  'management meetings',
  'meeting',
  'meetings',
  'project meetings',
  'regularly scheduled',
  'review meetings'},
 {'advising',
  'champions',
  'directors',
  'leadership direction',
  'lower level',
  'mentor',
  'mentor train',
  'mentorship',
  'principals',
  'proper use',
  'provides mentoring',
  'providing technical guidance',
  'sharing knowledge',
  'training staff',
  'transfer knowledge'},
 {'creating software',
  'go',
  'languages python',
  'proven hands',
  'ruby java',
  'ruby python'},
 {'collected', 'evaluate data', 'relevance', 'validate data'},
 {'avro', 'hdfs', 'kafka', 'serialization', 'structured data', 'thrift'},
 {'basic scripting',
  'batch files',
  'dos',
  'installshield',
  'nt',
  'power shell',
  'powershell',
  'powershell perl',
  'powershell scripting',
  'powershell vbscript',
  'scripting powershell',
  'scripting technologies',
  'vb script',
  'vb scripting',
  'vbscript',
  'windows powershell',
  'windows scripting',
  'wmi'},
 {'advanced networking',
  'basic network',
  'basic networking',
  'distributed computing environment',
  'firewalls load',
  'firewalls load balancers',
  'firewalls routers',
  'general networking',
  'infrastructure knowledge',
  'ip networking',
  'network technologies',
  'network topologies',
  'networking',
  'networking concepts',
  'networking principles',
  'networking tcp ip',
  'networking technologies',
  'osi model',
  'routers hubs',
  'routing firewalls',
  'security fundamentals',
  'security networking',
  'tcp ip dns dhcp',
  'tcp ip network',
  'tcp ip networking',
  'tcp ip protocol',
  'tcp ip routing'},
 {'coordinated',
  'ensure testing',
  'held accountable',
  'production support teams',
  'program team',
  'project testing',
  'qa manager',
  'qa resources',
  'qa team',
  'team leader',
  'test efforts',
  'test teams',
  'testing efforts',
  'testing team',
  'tracking issues'},
 {'adobe',
  'adobe analytics',
  'create custom',
  'sitecatalyst',
  'web analytics'},
 {'dm',
  'material management',
  'mrp',
  'plant maintenance',
  'production planning'},
 {'continual service improvement',
  'incident tickets',
  'managing it',
  'request fulfillment',
  'service management itsm'},
 {'administering windows',
  'basic windows',
  'client operating',
  'current versions',
  'desktop operating systems',
  'filemaker',
  'intermediate windows',
  'mac',
  'mac operating',
  'mac os',
  'mac os x',
  'mac osx',
  'macintosh',
  'maintaining windows',
  'managing windows',
  'microsoft desktop',
  'microsoft operating',
  'microsoft operating systems',
  'microsoft os',
  'microsoft windows',
  'microsoft windows desktop',
  'microsoft windows operating',
  'ms server',
  'ms windows',
  'operating systems microsoft',
  'os windows',
  'os x',
  'osx',
  'pc operating',
  'platforms windows',
  'server linux',
  'server operating system',
  'server operating systems',
  'server windows',
  'supporting windows',
  'technologies windows',
  'troubleshooting microsoft',
  'troubleshooting windows',
  'win7',
  'windows',
  'windows 7',
  'windows desktop',
  'windows desktop operating',
  'windows environment',
  'windows mac',
  'windows microsoft',
  'windows network',
  'windows nt',
  'windows operating system',
  'windows operating systems',
  'windows platforms',
  'windows system administration',
  'windows xp',
  'windows xp windows',
  'xp windows'},
 {'acceleration',
  'ace',
  'acs',
  'appliances',
  'balancer',
  'balancers',
  'big ip',
  'bigip',
  'blue coat',
  'citrix netscaler',
  'f5',
  'f5 big ip',
  'f5 load',
  'f5 load balancing',
  'f5 ltm',
  'gateway',
  'gtm',
  'health monitoring',
  'imperva',
  'infoblox',
  'load balancer',
  'load balancers',
  'load balancing',
  'ltm',
  'ltm gtm',
  'network firewalls',
  'network load',
  'prior hands',
  'proxies',
  'proxy',
  'proxy servers',
  'reverse',
  'server load',
  'server load balancing',
  'ssl certificates',
  'ssl vpn',
  'tacacs',
  'vip',
  'waf',
  'web application firewall',
  'websense'},
 {'client projects',
  'company working',
  'consumer product',
  'development manager',
  'mobile product',
  'played'},
 {'center security',
  'deployment strategies',
  'designs solutions',
  'directional',
  'lead efforts',
  'mscs',
  'network equipment'},
 {'basking ridge', 'eden', 'franklin', 'managment', 'resides'},
 {'articulate',
  'complex business',
  'it team',
  'technical terms',
  'translate business requirements',
  'translate functional requirements',
  'users it'},
 {'iso iec', 'soc', 'ssae', 'ssae16'},
 {'production support team'},
 {'advanced degrees',
  'b.s',
  'bachelor',
  'bachelors',
  'bs',
  'bs computer science',
  'bs cs',
  'bs degree',
  'bs ms',
  'bs ms degree',
  'computer engineering',
  'computer engineering electrical engineering',
  'computer science engineering',
  'cs',
  'cs ee',
  'degree',
  'directly relevant',
  'educational requirements',
  'ee',
  'ee cs',
  'electrical engineering computer science',
  'engineer level',
  'engineering computer science',
  'engineering degree',
  'equivalent knowledge',
  'ged',
  'higher degree',
  'hs',
  'hs diploma',
  'it computer science',
  'lieu',
  'master degree',
  'masters',
  'masters degree',
  'ms',
  'ms degree',
  'phd',
  'progressive it',
  'science computer science',
  'science degree',
  'similar technical',
  'substituted',
  'technical degree',
  'technical field',
  'typical minimum education'},
 {'archives',
  'company web',
  'content',
  'content management system cms',
  'digital assets',
  'sharepoint sites',
  'web platform',
  'web presence',
  'web site'},
 {'access powerpoint',
  'applications excel',
  'excel access powerpoint',
  'excellent computer',
  'literate',
  'microsoft office applications',
  'microsoft office software',
  'microsoft word',
  'microsoft word excel powerpoint',
  'ms office applications',
  'ms word excel',
  'outlook excel',
  'outlook word',
  'powerpoint outlook',
  'powerpoint project',
  'use microsoft',
  'word excel',
  'word excel access',
  'word excel powerpoint access',
  'word outlook'},
 {'climate',
  'frequent',
  'indoor',
  'normally',
  'office environment',
  'office setting',
  'standard office'},
 {'area',
  'chester',
  'corporate headquarters',
  'corporate office',
  'grand',
  'metro area',
  'mill',
  'mills',
  'san',
  'san mateo',
  'suburban',
  'toronto',
  'twin'},
 {'active current',
  'active dod',
  'active secret',
  'active secret clearance',
  'clearance northrop grumman',
  'cleared',
  'current active',
  'current dod',
  'current secret',
  'given preferential consideration',
  'interim secret',
  'sar',
  'sci',
  'sci clearance',
  'secret',
  'secret sci',
  'ssbi',
  'ts',
  'ts sci w'},
 {'code design', 'code written', 'procs', 'sequences', 'troubleshoot sql'},
 {'adherence',
  'appropriateness',
  'approving',
  'assurance activities',
  'assurance reviews',
  'assurance team',
  'audits',
  'checklist',
  'checklists',
  'conduct internal',
  'conduct quality',
  'contract requirements',
  'control documentation',
  'customer specifications',
  'defined processes',
  'documented',
  'ensure adherence',
  'ensure compliance',
  'impact assessments',
  'management function',
  'management procedures',
  'management reviews',
  'overall quality',
  'perform quality',
  'performing quality',
  'performs quality',
  'periodic reviews',
  'preliminary',
  'project process',
  'project reviews',
  'quality checks',
  'quality control',
  'quality requirements',
  'quality review',
  'quality reviews',
  'readiness reviews',
  'review',
  'review approval',
  'review documentation',
  'review processes',
  'review technical',
  'reviews',
  'systems designs',
  'technical accuracy',
  'technical review',
  'testing procedures',
  'walk throughs'},
 {'built relationships',
  'contacted',
  'dynamics ax',
  'dynamics crm',
  'dynamics gp',
  'dynamics nav',
  'employers',
  'i',
  'market i',
  'microsoft dynamics market',
  'placing',
  'specializing solely',
  'unrivaled'},
 {'accomplish goals',
  'accomplish tasks',
  'communicate findings',
  'complex situations',
  'decision making process',
  'decisions',
  'decisive',
  'discretion',
  'excellent judgment',
  'exercise good',
  'exercise independent judgment',
  'exercise sound',
  'good judgment',
  'judgement',
  'judgment',
  'judgments',
  'problem solving troubleshooting',
  'relying',
  'situational',
  'solves problems',
  'sound decisions',
  'sound judgment'},
 {'create process',
  'data process',
  'develop process',
  'document process',
  'gained',
  'maps',
  'procedural',
  'process mapping',
  'process maps',
  'use knowledge'},
 {'agile testing',
  'atdd',
  'bdd',
  'behavior driven',
  'cucumber',
  'gherkin',
  'pair',
  'tdd',
  'tdd bdd',
  'tdd test driven',
  'test driven'},
 {'day maintenance', 'day operation', 'monitoring maintenance'},
 {'c c++ programming',
  'device driver',
  'device drivers',
  'embedded c',
  'embedded linux',
  'embedded real',
  'embedded software',
  'embedded systems',
  'qnx',
  'rtos',
  'vxworks',
  'wind'},
 {'advanced software',
  'flight test',
  'missile',
  'operationally',
  'operator',
  'payload',
  'telemetry',
  'weapon',
  'weapons'},
 {'11i', 'oracle r12', 'r12'},
 {'build consensus',
  'business groups',
  'business it',
  'business partners',
  'business sponsors',
  'business stakeholders',
  'business technical',
  'client stakeholders',
  'cross functional business',
  'cross functional groups',
  'cross functional teams',
  'facilitate discussions',
  'influence',
  'internal stakeholders',
  'internal teams',
  'it business',
  'it teams',
  'leaders',
  'organization',
  'partners',
  'sponsors',
  'stakeholders',
  'teams',
  'technology stakeholders'},
 {'branching',
  'branching merging',
  'build deploy',
  'build process',
  'build release',
  'code deployment',
  'code management',
  'code repository',
  'cvs',
  'deployment processes',
  'environment configuration',
  'management build',
  'practices tools',
  'repositories',
  'revision control',
  'source code',
  'source code control',
  'source code management',
  'source control',
  'support tools',
  'version control',
  'versioning'},
 {'daily weekly monthly',
  'data accuracy',
  'ensure accurate',
  'monitor',
  'monitor analyze',
  'monitor daily',
  'nightly',
  'perform daily',
  'perform general',
  'perform regular',
  'system setup'},
 {'actionable',
  'complicated',
  'consumable',
  'crisp',
  'derive',
  'distill',
  'research findings',
  'translate complex',
  'translate those'},
 {'business consultant',
  'enterprise risk management',
  'information security risk management',
  'internal audit',
  'it audit',
  'it auditor',
  'it compliance',
  'it internal',
  'it risk',
  'it risk management',
  'management industry',
  'public accounting',
  'risk analyst',
  'technology risk management'},
 {'definitely',
  'direct contact',
  'emails',
  'frank',
  'mailing',
  'nigel',
  'receiving',
  'ref',
  'removed',
  'reply'},
 {'cloud computing',
  'cutting edge technologies',
  'data centers',
  'deliver highly',
  'delivering highly',
  'devops team',
  'different technology',
  'dynamically',
  'edge cloud',
  'efficient delivery',
  'integrated systems',
  'scale',
  'scaling',
  'services platform'},
 {'graphic designers', 'software web', 'web designers'},
 {'academic',
  'applied',
  'behavioral',
  'bioinformatics',
  'biological',
  'business analytics',
  'business marketing',
  'data science',
  'fields',
  'genomics',
  'pursuing',
  'science technology',
  'similarly',
  'technical communications'},
 {'abap',
  'application integration',
  'custom data',
  'es',
  'product configuration',
  'sap abap',
  'sap erp',
  'sap pi po',
  'sap portal',
  'sap workflow',
  'technical configuration',
  'workbench'},
 {'css3 html5'},
 {'attainment',
  'business sales',
  'client relationship',
  'exceed',
  'exceeded',
  'exceeding',
  'monthly quarterly',
  'quota',
  'quotas',
  'sales goals',
  'sales pipeline',
  'sales strategies',
  'sales targets',
  'targets'},
 {'additional information',
  'attach',
  'attachment',
  'brown',
  'current',
  'email address',
  'emailing',
  'inquire',
  'mention',
  'mentioned',
  'ms word format',
  'submit resumes'},
 {'hadoop administrator',
  'sql database administrator',
  'sql dba',
  'sql server dba',
  'sql server developer'},
 {'.net',
  '.net html',
  '.net programming',
  '.net sql',
  '.net sql server',
  '.net wpf',
  'asp .net',
  'asp asp.net',
  'asp.net visual',
  'c#',
  'c# programming',
  'c# vb.net asp.net',
  'c# visual',
  'com',
  'csharp',
  'css sql',
  'dot net',
  'foxpro',
  'javascript c#',
  'microsoft visual',
  'ms visual',
  'server .net',
  'server visual',
  'sql .net',
  'sql c#',
  'sql sql server',
  'vb.net',
  'vb6',
  'vc++',
  'visual basic .net',
  'visual c++',
  'visual studio .net',
  'visual studio sql'},
 {'project manager', 'team lead'},
 {'concepts principles',
  'it environments',
  'principles practices',
  'process procedures',
  'standard it',
  'technical platforms'},
 {'base sas',
  'business intelligence systems',
  'complex reporting',
  'data management',
  'dwh',
  'ets',
  'sas',
  'sas data',
  'sas di',
  'sas enterprise',
  'solutions leveraging',
  'sql business',
  'studio',
  'unica'},
 {'single page application',
  'single page applications',
  'single page web',
  'spa'},
 {'bleeding',
  'continually',
  'decrease',
  'drive improvements',
  'drive process',
  'efficiencies',
  'expedite',
  'identify process',
  'identify recommend',
  'implement improvements',
  'implement process',
  'implement process improvements',
  'implementing process',
  'improving processes',
  'increase customer',
  'intervention',
  'minimizes',
  'productivity quality',
  'recommend design',
  'reduce',
  'reduce costs',
  'share best practices',
  'standardize',
  'streamline',
  'streamline processes'},
 {'appropriate resources',
  'escalate problems',
  'established procedures',
  'resolve incidents'},
 {'delivery models', 'iaas', 'saas paas'},
 {'database engineering',
  'databases',
  'my sql',
  'mysql database',
  'mysql databases',
  'mysql oracle',
  'mysql postgres',
  'mysql postgresql',
  'postgres',
  'postgresql',
  'rdms',
  'relational databases sql',
  'sql databases'},
 {'auditor cisa',
  'casp',
  'ccna security',
  'ccnp security',
  'ccsp',
  'ceh',
  'certification cissp',
  'certifications cissp',
  'certified ethical hacker',
  'certified information',
  'certified information security manager',
  'certified information systems auditor',
  'certified information systems security',
  'cism',
  'cism cisa',
  'cism cissp',
  'cissp certified information',
  'cissp cisa',
  'cissp giac',
  'gsec',
  'isc2',
  'oscp',
  'sans giac',
  'security+ ce',
  'sscp'},
 {'latest versions', 'manager oracle', 'oracle ibm', 'pack', 'product suite'},
 {'computer security incident',
  'emergency response',
  'incident response team',
  'preparedness',
  'response team'},
 {'functional specs', 'spec'},
 {'active directory dns',
  'hyperv',
  'microsoft hyper v',
  'networking storage',
  'sccm scom',
  'server build',
  'server builds',
  'server vmware',
  'software stack',
  'vmware environment',
  'vmware esx',
  'windows server administration'},
 {'coordinate', 'oversee', 'plan coordinate', 'support ongoing'},
 {'application vulnerability',
  'assessment',
  'assessment reports',
  'assessments',
  'conducting security',
  'continuous monitoring',
  'identify security',
  'network vulnerability',
  'penetration tests',
  'perform network',
  'perform periodic',
  'perform risk',
  'perform security',
  'perform vulnerability',
  'performing security',
  'performing vulnerability',
  'poa&m',
  'policy compliance',
  'remediating',
  'remediation',
  'remediation activities',
  'remediation efforts',
  'review security',
  'scanning',
  'scans',
  'security analysis',
  'security assessment',
  'security assessments',
  'security audits',
  'security issues',
  'security vulnerability',
  'support security',
  'vulnerability assessments',
  'vulnerability scanning',
  'vulnerability scans',
  'vulnerability testing'},
 {'applications utilizing',
  'oracle application developer',
  'testing maintenance'}]

Dump Clusters to File for Later Analysis


In [106]:
# Persist the clustering so it can be inspected outside the notebook.
# Output format: one line per cluster -> "<label>|<kw1>,<kw2>,...", with the
# cluster's members sorted so a given cluster always serializes the same way.
# Opening with "w+" truncates any existing file at CLUSTERS_FILE.
with open(CLUSTERS_FILE, "w+") as clusters_out:
    clusters_out.writelines(
        str(cluster_id) + "|" + ",".join(sorted(members)) + "\n"
        for cluster_id, members in lbl2cluster.items()
    )